
**********
* Readme *
**********

* This script creates Table 1:  Overview of the population of interest usign summary statistics from POF and PNAD.

* Notes: [1] Data refers to the working-age population living in urban areas. [2] Individual observations are weighted by the inverse of their sampling probability, following the survey design, to render the coefficients meaningful for the population the sample represents. The standard errors around the point estimates are calculated via linearization, accounting for the stratification design, and the p-value was calculated based on the z-statistic of the difference between the estimates. [3] The POF survey conducted interviews between July 2017 and July 2018. In order to capture a similar time window, we use the 8 quarterly rounds of PNAD from 2017 and 2018: four of them overlapping with the data collection interval from POF, plus two quarters before, and two after it. [4] PNAD currently provides unique identifiers to households but not to household members. To track individuals across quarters, we adopt the advanced identification methodology proposed by \textcite{ribas_sobre_2008}, as implemented in Stata \parencite{statacorp_stata_2015} by the program \texttt{datazoom\_pnadcontinua} (version 1.0) from the Economics Department of PUC-Rio University.


* Root folder (PATH TO BE DEFINED BY THE USER)
**********************************************
clear all
global analysis "C:/***/replication_package"


* Timestamped log
*****************
global today = strofreal(date(c(current_date), "DMY"), "%tdYYNNDD")
log using "${analysis}/code/logs/3_2_descriptive_pop_stats_${today}.smcl", replace


* Merge both sources
********************
use "${analysis}/data/2_2_pof_clean.dta", clear
rename strata_id pof_strata_id
rename psu_id    pof_psu_id
rename hu_id     pof_hh_id
rename ind_id    pof_ind_id
rename pweight   pof_pweight

append using "${analysis}/data/2_4_pnad_clean.dta"
rename strata_id pnad_strata_id
rename psu_id    pnad_psu_id
rename hh_id     pnad_hh_id
rename ind_id    pnad_ind_id
rename pweight   pnad_pweight

order survey state region ///
      pof_strata_id   pof_psu_id  pof_hh_id  pof_ind_id  pof_pweight ///
      pnad_strata_id pnad_psu_id pnad_hh_id pnad_ind_id pnad_pweight

gen female = (male - 1)^(male) * 100
gen nonwhite = (white - 1)^(white) * 100

replace educ_1 = educ_1 * 100
replace educ_2 = educ_2 * 100
replace educ_3 = educ_3 * 100
replace educ_4 = educ_4 * 100

replace age_1 = age_1 * 100
replace age_2 = age_2 * 100
replace age_3 = age_3 * 100
replace age_4 = age_4 * 100
replace age_5 = age_5 * 100

gen pop = 1/10^6 // population in millions


********************************************************
*   TABLE SURVEYS: Descriptive for POF + pooled PNAD   *
********************************************************

global variables female nonwhite educ_1 educ_2 educ_3 educ_4 age_1 age_2 age_3 age_4 age_5 


* [1] POF
*********

svyset pof_psu_id [pweight = pof_pweight], strata(pof_strata_id) singleunit(centered) 
svy: mean $variables
matrix pof = r(table)
matrix pof = pof["b", 1...] \ pof["se", 1...]
matrix list pof

unique pof_hh_id if survey == "pof"
global N_hh = r(unique)
unique pof_ind_id if survey == "pof"
global N_ind = r(unique)
global N_obs = r(N)
svydescribe
matrix pof_survey = (r(N_strata), r(N_units), $N_hh, $N_ind, $N_obs \ ., ., ., ., .) 
matrix colnames pof_survey = "strata" "psu" "households" "individuals" "observations"
matrix list pof_survey

matrix empty_p = (., ., ., ., ., ., ., ., ., ., ., ., ., ., ., .)
matrix rownames empty_p = "p"
matrix list empty_p

matrix pof_table = (pof, pof_survey \ empty_p)
matrix coleq pof_table = "POF"
matrix list pof_table


* [2] Pooled PNAD
*****************

svyset pnad_psu_id [pweight = pnad_pweight], strata(pnad_strata_id) singleunit(centered)
svy: mean $variables
matrix pnad = r(table)
matrix pnad = pnad["b", 1...] \ pnad["se", 1...]
matrix list pnad

unique pnad_hh_id if survey == "pnad"
global N_hh = r(unique)
unique pnad_ind_id if survey == "pnad"
global N_ind = r(unique)
global N_obs = r(N)
svydescribe
matrix pnad_survey = (r(N_strata), r(N_units), $N_hh, $N_ind, $N_obs \ ., ., ., ., .) 
matrix colnames pnad_survey = "strata" "psu" "households" "individuals" "observations"
matrix list pnad_survey

matrix pnad_table = (pnad, pnad_survey \ empty_p)
matrix coleq pnad_table = "pnad"
matrix list pnad_table


* [3] Diff POF - PNAD
*********************

foreach variable of global variables  {

    scalar b_pof   =  pof[rownumb(matrix(pof),  "b"), colnumb(matrix(pof),  "`variable'")] 
    scalar b_pnad  = pnad[rownumb(matrix(pnad), "b"), colnumb(matrix(pnad), "`variable'")]

    scalar se_pof  =  pof[rownumb(matrix(pof),  "se"), colnumb(matrix(pof),  "`variable'")]
    scalar se_pnad = pnad[rownumb(matrix(pnad), "se"), colnumb(matrix(pnad), "`variable'")]

    scalar diff_`variable' = b_pof - b_pnad
    scalar z_`variable' = abs(diff_`variable') / sqrt(se_pof^2 + se_pnad^2)
    scalar p_`variable' = (1 - normal(abs(z_`variable'))) * 2
}

matrix diff = (diff_female, diff_nonwhite, diff_educ_1, diff_educ_2, diff_educ_3, diff_educ_4, diff_age_1, diff_age_2, diff_age_3, diff_age_4, diff_age_5)
matrix colnames diff = $variables
matrix list diff

matrix z = (z_female, z_nonwhite, z_educ_1, z_educ_2, z_educ_3, z_educ_4, z_age_1, z_age_2, z_age_3, z_age_4, z_age_5)
matrix colnames z = $variables
matrix list z

matrix p = (p_female, p_nonwhite, p_educ_1, p_educ_2, p_educ_3, p_educ_4, p_age_1, p_age_2, p_age_3, p_age_4, p_age_5)
matrix colnames p = $variables
matrix list p

matrix diff = diff \ ., ., ., ., ., ., ., ., ., ., .\ p 
matrix rownames diff = "b" "se" "p"
matrix list diff

matrix diff_survey = (., ., ., ., .\ ., ., ., ., .\ ., ., ., ., .) 
matrix colnames diff_survey = "strata" "psu" "households" "individuals" "observations"
matrix diff_table = diff, diff_survey
matrix coleq diff_table = "diff"
matrix list diff_table


* Summary
*********

matrix sum_table_surveys = pof_table, pnad_table, diff_table 
matrix list sum_table_surveys
estadd matrix sum_table_surveys

esttab, varwidth(50) ///
  cells("sum_table_surveys[b](fmt(%12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.0gc)) sum_table_surveys[se](fmt(%12.2fc) par) & sum_table_surveys[p](fmt(%12.2fc))") ///
  nonumbers mgroups(none) mlabels(none) collabels(none) eqlabels("A" "B") unstack noobs ///
  varlabels( ///
    female       "Female" ///
    nonwhite     "Nonwhite" ///
    educ_1       "Less than prim. school" ///
    educ_2       "Primary school" ///
    educ_3       "High school" ///
    educ_4       "College or above" ///
    age_1        "Age 14-24" ///
    age_2        "Age 25-34" ///
    age_3        "Age 35-44" ///
    age_4        "Age 45-54" ///
    age_5        "Age 55-64" ///
    strata       "Strata" ///
    psu          "Primary Sampling Units" ///
    households   "Unique households" ///
    individuals  "Unique individuals" ///
    observations "Observations") ///
  refcat( ///
    female       "Gender and ethnicity (in %)" ///
    educ_1       "Education level (in %)" ///
    age_1        "Age group (in %)" ///
    strata       "Survey structure", nolabel)
    

* TABLE 1
*********

esttab using "${analysis}/results/tables/table1.tex", style(tex) fragment replace ///
  cells("sum_table_surveys[b](fmt(%12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.2fc %12.0fc)) sum_table_surveys[se](fmt(2) par) & sum_table_surveys[p](fmt(3))") ///
  nonumbers mgroups(none) mlabels(none) collabels(none) eqlabels(none) unstack noobs ///
  varlabels( ///
    female       "\hspace{2ex} Female" ///
    nonwhite     "\hspace{2ex} Nonwhite" ///
    educ_1       "\hspace{2ex} Less than prim. school" ///
    educ_2       "\hspace{2ex} Primary school" ///
    educ_3       "\hspace{2ex} High school" ///
    educ_4       "\hspace{2ex} College or above" ///
    age_1        "\hspace{2ex} Age 14-24" ///
    age_2        "\hspace{2ex} Age 25-34" ///
    age_3        "\hspace{2ex} Age 35-44" ///
    age_4        "\hspace{2ex} Age 45-54" ///
    age_5        "\hspace{2ex} Age 55-64" ///
    strata       "\hspace{2ex} Strata" ///
    psu          "\hspace{2ex} Primary Sampling Units" ///
    households   "\hspace{2ex} Unique households" ///
    individuals  "\hspace{2ex} Unique individuals" ///
    observations "\hspace{2ex} Observations") ///
  refcat( ///
    female       "\textit{Gender and ethnicity (in %)}" ///
    educ_1       "\tabularnewline \textit{Education level (in %)}" ///
    age_1        "\tabularnewline \textit{Age group (in %)}" ///
    strata       "\tabularnewline \midrule \textit{Survey structure}", nolabel) ///
  substitute(% \% \hline "" "(.)" "" ") ." ")" "&" " & " "  " " " "  " " " "  " " " "  " " " "& ." "& $\cdot$")

  
* End of script
***************
cap log close